library(readxl)
library(readr)
library(dplyr)
library(here)
library(tidyverse)
library(stringr)
sheet1 <- read_xlsx(here("raw_data/boing-boing-candy-2015.xlsx"))
sheet1
dim(sheet1)
colnames(sheet1)
glimpse(sheet1)
sheet1_long <- sheet1 %>%
pivot_longer(cols = starts_with("["), #converting rows to columns
names_to = "candy",
values_to = "rating")
#check dimensions, column names
dim(sheet1_long)
colnames(sheet1_long)
#Omitting columns that are not required
sheet1_long <- sheet1_long %>%
select(-c(4:29))
colnames(sheet1_long)
dim(sheet1_long)
colnames(sheet1_long) <- c("timestamp", "age", "going_out", "candy", "rating")
colnames(sheet1_long)
[1] "timestamp" "age" "going_out" "candy" "rating"
#Add column ‘year’
sheet1_long <- sheet1_long %>%
add_column(year = 2015)
head(sheet1_long)
sheet2 <- read_xlsx(here("raw_data/boing-boing-candy-2016.xlsx"))
sheet2
dim(sheet2)
colnames(sheet2)
glimpse(sheet2)
sheet2_long <- sheet2 %>%
pivot_longer(cols = starts_with("["), #converting rows to columns
names_to = "candy",
values_to = "rating")
dim(sheet2_long)
colnames(sheet2_long)
#Omitting columns that are not required
sheet2_long <- sheet2_long %>%
select(-c(6:22))
colnames(sheet2_long)
[1] "Timestamp" "Are you going actually going trick or treating yourself?"
[3] "Your gender:" "How old are you?"
[5] "Which country do you live in?" "candy"
[7] "rating"
dim(sheet2_long)
[1] 127159 7
colnames(sheet2_long) <- c("timestamp", "going_out", "gender", "age", "country", "candy", "rating")
colnames(sheet2_long)
[1] "timestamp" "going_out" "gender" "age" "country" "candy" "rating"
#Add column ‘year’
sheet2_long <- sheet2_long %>%
add_column(year = 2016)
head(sheet2_long)
distinct(sheet2_long, country)
sheet3 <- read_xlsx(here("raw_data/boing-boing-candy-2017.xlsx"))
New names:
* `` -> ...114
sheet3
dim(sheet3)
colnames(sheet3)
glimpse(sheet3)
sheet3_long <- sheet3 %>%
pivot_longer(cols = starts_with("Q6"), #converting rows to columns
names_to = "candy",
values_to = "rating")
colnames(sheet3_long)
[1] "Internal ID" "Q1: GOING OUT?" "Q2: GENDER" "Q3: AGE"
[5] "Q4: COUNTRY" "Q5: STATE, PROVINCE, COUNTY, ETC" "Q7: JOY OTHER" "Q8: DESPAIR OTHER"
[9] "Q9: OTHER COMMENTS" "Q10: DRESS" "...114" "Q11: DAY"
[13] "Q12: MEDIA [Daily Dish]" "Q12: MEDIA [Science]" "Q12: MEDIA [ESPN]" "Q12: MEDIA [Yahoo]"
[17] "Click Coordinates (x, y)" "candy" "rating"
#Omitting columns that are not required
sheet3_long <- sheet3_long %>%
select(-c(6:17))
colnames(sheet3_long)
[1] "Internal ID" "Q1: GOING OUT?" "Q2: GENDER" "Q3: AGE" "Q4: COUNTRY" "candy" "rating"
dim(sheet3_long)
[1] 253380 7
#Change col names
colnames(sheet3_long) <- c("internal_id", "going_out", "gender", "age", "country", "candy", "rating")
colnames(sheet3_long)
[1] "internal_id" "going_out" "gender" "age" "country" "candy" "rating"
sheet3_long
#summarise((count = sum(is.na(rating))))
#Add column ‘year’
sheet3_long <- sheet3_long %>%
add_column(year = 2017)
head(sheet3_long)
distinct(sheet3_long, country)
#X X X X X X X X X X X X X X X X X X X X X X X X X X X X X X
candy_combined <- bind_rows(sheet1_long, sheet2_long, sheet3_long)
colnames(candy_combined)
[1] "timestamp" "age" "going_out" "candy" "rating" "year" "gender" "country" "internal_id"
dim(candy_combined)
[1] 915389 9
head(candy_combined)
#check
distinct(candy_combined, going_out)
#Changing column data type
candy_combined$age <- as.integer(candy_combined$age)
NAs introduced by coercionNAs introduced by coercion to integer range
candy_combined$year <- as.integer(candy_combined$year)
candy_combined$internal_id <- as.integer(candy_combined$internal_id)
head(candy_combined)
NA
candy_combined <- candy_combined %>%
filter(age <= 122)
length(candy_combined$age)
[1] 869160
distinct(candy_combined, age)
candy_combined <- candy_combined %>%
mutate_if(is.character, str_to_lower)
head(candy_combined)
unique(candy_combined$country)
[1] NA
[2] "canada"
[3] "usa"
[4] "us"
[5] "uk"
[6] "united states of america"
[7] "japan"
[8] "united states"
[9] "france"
[10] "ussa"
[11] "u.s.a."
[12] "england"
[13] "switzerland"
[14] "murica"
[15] "united kingdom"
[16] "neverland"
[17] "usa!"
[18] "korea"
[19] "u.s."
[20] "america"
[21] "units states"
[22] "belgium"
[23] "croatia"
[24] "portugal"
[25] "usa usa usa"
[26] "the best one - usa"
[27] "usa! usa! usa!"
[28] "españa"
[29] "there isn't one for old men"
[30] "panama"
[31] "the yoo ess of aaayyyyyy"
[32] "united kindom"
[33] "hungary"
[34] "austria"
[35] "new zealand"
[36] "germany"
[37] "mexico"
[38] "australia"
[39] "brasil"
[40] "god's country"
[41] "south korea"
[42] "usa!!!!!!"
[43] "philippines"
[44] "eua"
[45] "usa! usa!"
[46] "sweden"
[47] "united sates"
[48] "the netherlands"
[49] "finland"
[50] "merica"
[51] "china"
[52] "kenya"
[53] "the republic of cascadia"
[54] "united stetes"
[55] "usa usa usa usa"
[56] "united states of america"
[57] "netherlands"
[58] "united state"
[59] "united staes"
[60] "uae"
[61] "usausausa"
[62] "unhinged states"
[63] "us of a"
[64] "unites states"
[65] "the united states"
[66] "north carolina"
[67] "unied states"
[68] "europe"
[69] "earth"
[70] "u s"
[71] "u.k."
[72] "costa rica"
[73] "the united states of america"
[74] "unite states"
[75] "cascadia"
[76] "greece"
[77] "usa? hard to tell anymore.."
[78] "'merica"
[79] "usas"
[80] "pittsburgh"
[81] "a"
[82] "can"
[83] "canae"
[84] "new york"
[85] "trumpistan"
[86] "ireland"
[87] "california"
[88] "south africa"
[89] "i pretend to be from canada, but i am really from the united states."
[90] "iceland"
[91] "canada`"
[92] "scotland"
[93] "denmark"
[94] "united stated"
[95] "ud"
[96] "new jersey"
[97] "indonesia"
[98] "united ststes"
[99] "united statss"
[100] "endland"
[101] "murrika"
[102] "usaa"
[103] "soviet canuckistan"
[104] "n. america"
[105] "singapore"
[106] "taiwan"
[107] "hong kong"
[108] "spain"
[109] "narnia"
[110] "u s a"
[111] "united statea"
[112] "1"
[113] "usa usa usa!!!!"
[114] "i don't know anymore"
country_distinct <- distinct(candy_combined, country)
candy_combined <- candy_combined %>%
mutate(candy = str_extract(candy, "[^\\[\\]]+"),
candy = str_extract(candy, "[^q6 |]+[^|]+"))
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
1.4.2 Analysis questions 1. What is the total number of candy ratings given across the three years. (number of candy ratings, not number of raters. Don’t count missing values)
candy_combined %>%
filter(!is.na(rating)) %>%
summarise(total_rating_count = n())
#total_candy_ratings
candy_combined %>%
select(going_out, age) %>%
filter(!is.na(going_out)) %>%
group_by(going_out) %>%
summarise(average_age = mean(age))
`summarise()` ungrouping output (override with `.groups` argument)
candy_combined %>%
select(candy, rating) %>%
filter(!is.na(rating)) %>%
group_by(rating, candy) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
slice(seq_len(1))
`summarise()` regrouping output by 'rating' (override with `.groups` argument)
distinct(candy_combined, candy)
candy_combined %>%
select(candy, rating) %>%
filter(candy == "starburst", rating == "despair") %>%
group_by(rating) %>%
summarise(starburst_count = n()) #%>%
`summarise()` ungrouping output (override with `.groups` argument)
#arrange(desc(count)) #%>%
#slice(seq_len(3))
For the next three questions, count despair as -1, joy as +1 and meh as 0.
candy_combined %>%
select(gender, candy, rating) %>%
filter(!is.na(rating)) %>%
mutate(new_rating = case_when(rating == "despair" ~ -1,
rating == "joy" ~ 1,
rating == "meh" ~ 0,)) %>%
group_by(gender, candy) %>%
summarise(count_popular = sum(new_rating)) %>%
arrange(desc(count_popular)) %>%
slice(seq_len(1))
`summarise()` regrouping output by 'gender' (override with `.groups` argument)
candy_combined %>%
select(year, candy, rating) %>%
filter(!is.na(rating)) %>%
mutate(new_rating = case_when(rating == "despair" ~ -1,
rating == "joy" ~ 1,
rating == "meh" ~ 0,)) %>%
group_by(year, candy) %>%
summarise(popular_candy = sum(new_rating)) %>%
arrange(desc(popular_candy)) %>%
slice(seq_len(1))
`summarise()` regrouping output by 'year' (override with `.groups` argument)
TEST XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
sheet3_long %>%
mutate(new_candy3= str_extract(candy, "[^Q6 |]+[^|]+"))
new_candy3
Error: object 'new_candy3' not found
sheet2_long %>%
mutate(new_candy2 = str_extract(candy, "[^\\[\\]]+"))
new_candy2
Error: object 'new_candy2' not found